Week 2: Data Visualization

PSC 8101 Lab, Wed., Sept. 4, 2024

Outline

  1. Data visualization
  2. Data wrangling and the Tidyverse
  3. Look over Assignment 1

Load Packages You Already Have Installed

library(tidyverse) #ggplot2, readr, dplyr, etc.
library(haven) #opening Stata datasets
library(skimr) #descriptive statistics
library(nycflights13) #load datasets from moderndive book

states <- read_dta("states.dta")
  • Note how line numbers match the line numbers in the .R script

Install and Load Packages You Don’t Have Installed

#install.packages("scales")
library(scales) #for labeling axes with %, $, etc.
#install.packages("ggridges")
library(ggridges)
#install.packages("moderndive")
library(moderndive)
#install.packages("ggrepel")
library(ggrepel)
#install.packages("questionr")
library(questionr) #for frequency tables

Load Data

nes <- read_dta("nes.dta")
states <- read_dta("states.dta")
vdem <- read_dta("vdem.dta")

Some Basic Syntax for Variables

# Calling up variables in datasets: $ method, "data$variable"
skim(vdem$v2x_polyarchy)

Some Basic Syntax for Variables

Data summary
Name vdem$v2x_polyarchy
Number of rows 179
Number of columns 1
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
data 0 1 0.52 0.25 0.02 0.29 0.52 0.75 0.91 ▃▆▆▆▇

Some Basic Syntax for Variables

# For skimr, you can also use (data, variable)
skim(vdem, v2x_polyarchy)

Some Basic Syntax for Variables

Data summary
Name vdem
Number of rows 179
Number of columns 4171
_______________________
Column type frequency:
numeric 1
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
v2x_polyarchy 0 1 0.52 0.25 0.02 0.29 0.52 0.75 0.91 ▃▆▆▆▇

Some Basic Syntax for Variables

# Frequency table, using questionr package: 
freq(vdem$v2x_regime)
                         n    % val%
[0] Closed Autocracy    25 14.0 14.0
[1] Electoral Autocracy 64 35.8 35.8
[2] Electoral Democracy 54 30.2 30.2
[3] Liberal Democracy   36 20.1 20.1

Histograms

  • Univariate display summarizing the spread and distribution of a single interval variable.
  • A graphical frequency distribution for an interval/continuous variable.
# We'll use the "weather" data from nycflights13 package; 
# Hourly meteorological data for LGA, JFK and EWR.
# We want to visualize the distribution of a single variable with a histogram; 
# these are ideal for "continuous" variables with lots of values.
weather <- weather

Histograms

# We'll focus on temperatures ("temp") at these three airports
# Note: geometry for a histogram is "geom_histogram"
ggplot(data = weather, mapping = aes(x = temp)) +
  geom_histogram() 

Histograms

# Too clumpy; let's delineate the bars some more. Always use color = "white" option.
ggplot(data = weather, mapping = aes(x = temp)) +
  geom_histogram(color = "white")

Histograms

# We can also adjust the "bins" or the width of the bars; more or less fine-grained
ggplot(data = weather, mapping = aes(x = temp)) +
  geom_histogram(bins = 40, color = "white")
ggplot(data = weather, mapping = aes(x = temp)) +
  geom_histogram(binwidth = 10, color = "white")

Histograms

# Rules for binwidth? The Freedman-Diaconis rule:
# binwidth <- 2 * IQR / (N^(1/3))
skim(weather, temp)
# Compute IQR and N directly from the data instead of retyping the rounded
# p25/p75 values and the row count from the skim() output. Missing values are
# excluded so they neither break IQR() nor get counted in N.
bw <- (2 * IQR(weather$temp, na.rm = TRUE)) / (sum(!is.na(weather$temp))^(1/3))

Histograms

  • Specify binwidth = bw
ggplot(data = weather, mapping = aes(x = temp)) +
  geom_histogram(binwidth = bw, color = "white")

Histograms

# Facet wraps: These are very cool for subsetting by some grouping

ggplot(data = weather, mapping = aes(x = temp)) +
  geom_histogram(binwidth = 5, color = "white") +
  facet_wrap(~month)

Histograms

# We can also report y-axis in percentage terms; the third line uses 
# the "scales" package
# also, label axes, give title
# after_stat(count) is the per-bar count computed by the histogram; dividing by
# its sum gives each bar's share of all observations. It replaces the older
# ..count.. notation, which ggplot2 has deprecated.
ggplot(data = weather, mapping = aes(x = temp)) +
  geom_histogram(aes(y = after_stat(count / sum(count))), binwidth = 2.029083, color = "white") +
  scale_y_continuous(labels=percent) +
  labs(x="Temperature", y="Percentage of Hours", title="Temperatures")

Histograms

# Now change labeling for y-axis values
# (after_stat(count) replaces the deprecated ..count.. notation)
ggplot(data = weather, mapping = aes(x = temp)) +
  geom_histogram(aes(y = after_stat(count / sum(count))), binwidth = 2.029083, color = "white") +
  scale_y_continuous(limits = c(0, .06), breaks = seq(0, .06, by = .01), labels=percent) +
  labs(x="Temperature", y="Percentage of Hours", title="Temperatures")

Histograms

# Change theme for graph background: theme_minimal() is added as the last line
# (after_stat(count) replaces the deprecated ..count.. notation)
ggplot(data = weather, mapping = aes(x = temp)) +
  geom_histogram(aes(y = after_stat(count / sum(count))), binwidth = 2.029083, color = "white") +
  scale_y_continuous(limits = c(0, .06), breaks = seq(0, .06, by = .01), labels=percent) +
  labs(x="Temperature", y="Percentage of Hours", title="Temperatures") +
  theme_minimal()

Histograms

# Center title: hjust = 0.5 in the last line places the title at the horizontal midpoint
# (after_stat(count) replaces the deprecated ..count.. notation)
ggplot(data = weather, mapping = aes(x = temp)) +
  geom_histogram(aes(y = after_stat(count / sum(count))), binwidth = 2.029083, color = "white") +
  scale_y_continuous(limits = c(0, .06), breaks = seq(0, .06, by = .01), labels=percent) +
  labs(x="Temperature", y="Percentage of Hours", title="Temperatures") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

Note on Syntax

# There are some aspects of the syntax used above that are unnecessary and that we can
# eliminate. For example: 

# This command: 
ggplot(data = weather, mapping = aes(x = temp)) +
  geom_histogram()
# is identical to this command (note that I've removed "data=" and "mapping="; 
# R matches the two unnamed arguments to ggplot() by their position): 
ggplot(weather, aes(x = temp)) +
  geom_histogram()
# You actually don't even need "x=" but I like to include that to make it clear.
ggplot(weather, aes(temp)) +
  geom_histogram()
# From now on, we'll simplify to the second command

Density Plots

  • Univariate display summarizing the spread and distribution of a single interval variable.
  • It’s a “smoothed histogram.”
  • A graphical frequency distribution for an interval/continuous variable.

Density Plots

ggplot(weather, aes(x = temp)) +
  geom_density() +
  labs(x="Temperature", y="Density", title="Temperatures") +
  theme(plot.title = element_text(hjust = 0.5))

Density Plots

# Fill with color
ggplot(weather, aes(x = temp)) +
  geom_density(fill="dodgerblue") +
  labs(x="Temperature", y="Density", title="Temperatures") +
  theme(plot.title = element_text(hjust = 0.5))

Density Plots

# Make transparent with "alpha"
ggplot(weather, aes(x = temp)) +
  geom_density(fill="dodgerblue", alpha=.5) +
  labs(x="Temperature", y="Density", title="Temperatures") +
  theme(plot.title = element_text(hjust = 0.5))

Density Plots

# Ridge plots
# Notice I have to change month to "factor" variable. Most variables are "continuous"
# by default. I need to change this to "discrete" or ggplot can't graph it like 
# I want it to.
ggplot(weather, aes(x = temp, y=factor(month))) +
  geom_density_ridges(fill="dodgerblue", alpha=.5) +
  labs(x="Temperature", y="Month", title="Temperatures") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

Boxplots

# What do boxplots give us exactly? "Low", 25th pctile, 50th pctile (median), 75th pctile, 
# and "high"; and outliers.

# Let's again use the weather data: temperature 
# Geometry for boxplot is "geom_boxplot"
ggplot(weather, aes(y = temp)) +
  geom_boxplot() 

Boxplots

# Use more fine-grained y-axis labels
ggplot(weather, aes(y = temp)) +
  geom_boxplot() + 
  scale_y_continuous(limits = c(0, 100), breaks = seq(0, 100, by = 5))

Boxplots

# Now let's look at temp by month.
# We need to treat month as a "factor" or categorical variable instead of a continuous 
# variable. We'll talk more about this distinction later in the class.
ggplot(weather, aes(x = factor(month), y = temp)) +
  geom_boxplot() + 
  labs(x="Month", y="Temperature")

Barplots

  • Univariate display summarizing the spread and distribution of a single nominal or ordinal variable.
  • A graphical frequency distribution of a discrete (nominal or ordinal) variable.

Barplots

  • Now let’s look at the number of flights by carrier.
  • Carrier is a nominal (categorical) variable, so a barplot, rather than a histogram, is the appropriate display.
# Call up flights data
flights <- flights

Barplots

# We could get a frequency distribution in table form using the 
# "freq" command from the questionr package.
freq(flights$carrier)
       n    % val%
9E 18460  5.5  5.5
AA 32729  9.7  9.7
AS   714  0.2  0.2
B6 54635 16.2 16.2
DL 48110 14.3 14.3
EV 54173 16.1 16.1
F9   685  0.2  0.2
FL  3260  1.0  1.0
HA   342  0.1  0.1
MQ 26397  7.8  7.8
OO    32  0.0  0.0
UA 58665 17.4 17.4
US 20536  6.1  6.1
VX  5162  1.5  1.5
WN 12275  3.6  3.6
YV   601  0.2  0.2

Barplots

# Now ggplot; geometry for bar graph is "geom_bar"
ggplot(flights, aes(x = carrier)) +
  geom_bar()

Barplots

# Stacked bar graphs; let's say you wanted to break down by origin (EWR, JFK, or LGA)
ggplot(flights, aes(x = carrier, fill = origin)) +
  geom_bar()

Barplots

# Grouped bar graphs; let's say you wanted to break down by origin (EWR, JFK, or LGA)
ggplot(flights, aes(x = carrier, fill = origin)) +
  geom_bar(position = "dodge")

Barplots

# We could also facet wrap
ggplot(flights, aes(x = carrier)) +
  geom_bar() +
  facet_wrap(~ origin, ncol = 1)

Barplots

# Note how y-axis is a count by default. We could report as a proportion instead.
# after_stat(count) is the per-bar count computed by geom_bar; dividing by its
# sum gives each carrier's share of all flights. It replaces the older
# ..count.. notation, which ggplot2 has deprecated.
ggplot(flights, aes(x = carrier)) +
  geom_bar(aes(y = after_stat(count / sum(count))))

Barplots

# Percentage much more intuitive; the third line uses the package "scales", which 
# converts the proportion to a percentage and adds a percent sign.
# (after_stat(count) replaces the deprecated ..count.. notation)
ggplot(flights, aes(x = carrier)) +
  geom_bar(aes(y = after_stat(count / sum(count)))) +
  scale_y_continuous(labels=percent) 

Barplots

# Report y-axis labels as percentage; the third line uses the package "scales"; label axes
# (after_stat(count) replaces the deprecated ..count.. notation)
ggplot(flights, aes(x = carrier)) +
  geom_bar(aes(y = after_stat(count / sum(count)))) +
  scale_y_continuous(labels=percent) +
  labs(x="Carrier", y="Percentage")

Scatterplots

  • Bivariate display summarizing the association/relationship between X (independent variable) and Y (dependent variable).
  • Scatterplots ideal for interval/continuous X and Y variables.

Scatterplots

# Alaska data (from moderndive package); let's view it first
alaska_flights <- alaska_flights

# Variable descriptions
#?alaska_flights

Scatterplots

# Basic scatterplot; note how the plus sign joins the different components of the
# plot: the ggplot() call (data and aesthetic mapping) and the geometry added to it.
# Note how the three elements of "grammar of graphics" are implemented with ggplot.
# The geometry name for a scatterplot is "geom_point"
ggplot(alaska_flights, aes(x = dep_delay, y = arr_delay)) + 
  geom_point()

Scatterplots

# We can add different features, which we'll do throughout the semester. 
# Let's add more informative labels for y and x axis
ggplot(alaska_flights, aes(x = dep_delay, y = arr_delay)) + 
  geom_point() +
  labs(x="Departure Delay", y="Arrival Delay")

Scatterplots

# Transparent dots and white background
ggplot(alaska_flights, aes(x = dep_delay, y = arr_delay)) + 
  geom_point(alpha = .2) +
  labs(x="Departure Delay", y="Arrival Delay") +
  theme_minimal()

Scatterplots

# Make dots smaller
ggplot(alaska_flights, aes(x = dep_delay, y = arr_delay)) + 
  geom_point(size=.5, alpha=.2) +
  labs(x="Departure Delay", y="Arrival Delay") +
  theme_minimal()

Scatterplots

# "Jitter" the points -- good option if there are many overlaps
ggplot(alaska_flights, aes(x = dep_delay, y = arr_delay)) + 
  geom_jitter(width = 30, height = 30) +
  labs(x="Departure Delay", y="Arrival Delay") +
  theme_minimal()

Scatterplots

# Color code by month
ggplot(alaska_flights, aes(x = dep_delay, y = arr_delay, color=factor(month))) +
  geom_jitter(width = 30, height = 30) +
  labs(x="Departure Delay", y="Arrival Delay") +
  theme_minimal()

Scatterplots

# Let's add a "line of best fit" to the plot
ggplot(alaska_flights, aes(x = dep_delay, y = arr_delay)) + 
  geom_point() +
  geom_smooth(method="lm") +
  labs(x="Departure Delay", y="Arrival Delay") +
  theme_minimal()

Scatterplots

# Take out the confidence interval (shading around line)
ggplot(alaska_flights, aes(x = dep_delay, y = arr_delay)) + 
  geom_point() +
  geom_smooth(method="lm", se=FALSE) +
  labs(x="Departure Delay", y="Arrival Delay") +
  theme_minimal()

Scatterplots

# Set limits and increments for y-axis labels
ggplot(alaska_flights, aes(x = dep_delay, y = arr_delay)) + 
  geom_point() +
  geom_smooth(method="lm", se=FALSE) +
  labs(x="Departure Delay", y="Arrival Delay") +
  scale_y_continuous(limits = c(-100, 200), breaks = seq(-100, 200, by = 50)) +
  theme_minimal()

Linegraphs

  • Bivariate plot of a dependent variable (Y) plotted against a time variable (X).
  • Shows the time series trend of a particular variable.
# We'll use the "early_january_weather" data from moderndive package
early_january_weather <- early_january_weather
#?early_january_weather

Linegraphs

# Let's plot temperatures ("temp") in Jan. against time ("time_hour")
# The geometry for a linegraph is "geom_line"
ggplot(early_january_weather, aes(x = time_hour, y = temp)) +
  geom_line()

Linegraphs

# Smoother to envision trend and other options. 
ggplot(early_january_weather, aes(x = time_hour, y = temp)) +
  geom_line() +
  geom_smooth(se=FALSE) +
  labs(x="Time/hour", y="Temperature") +
  scale_y_continuous(limits = c(0, 75), breaks = seq(0, 75, by = 10))